In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
# Load the test dataset from the Excel workbook in the working directory.
test_df = pd.read_excel(io="FinalTestDataset2024.xls")

In [3]:
# Load the previously trained model from its pickle file.
# NOTE(review): the file and variable names suggest a support vector regressor,
# not a random forest classifier — confirm against the training notebook.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load a pickle that you produced yourself or otherwise fully trust.
with open("svr_test.pickle", "rb") as f:
    SVR = pickle.load(f)

In [4]:
# Count, for every row, how many cells hold the 999 missing-value sentinel.
missing_per_row = (test_df == 999).sum(axis=1)

# Keep only rows with fewer than 4 missing values (rows with 4 or more are
# dropped). Filtering with a boolean mask avoids mixing positional row numbers
# with index labels, so the result is correct for any index, not just a
# default RangeIndex.
test_df = test_df.loc[missing_per_row < 4].reset_index(drop=True)


In [5]:
# Pull the 'ID' column out of test_df (removing it in place) and keep it so the
# identifiers can be paired with the predictions later.
ID_data = test_df.pop('ID')

In [6]:
# Record where the 999 sentinels are before replacing them, so that only the
# imputed cells are rounded afterwards.
missing_values_index = np.where(test_df == 999)

# Replace 999 with NaN. np.nan is used because the capitalised alias np.NaN
# was removed in NumPy 2.0.
new_df = test_df.replace(999, np.nan)

# Fill the missing values with IterativeImputer.
# NOTE(review): the imputer is fitted on this test data; if an imputer was
# fitted during training, reusing it here would keep the preprocessing
# consistent with what the model saw — confirm.
multivariate_imp = IterativeImputer(random_state=42)
multi_imputed_array = multivariate_imp.fit_transform(new_df)

# Round the imputed cells only, in one vectorised assignment; cells that were
# not missing are left untouched.
multi_imputed_array[missing_values_index] = np.round(
    multi_imputed_array[missing_values_index]
)

# Rebuild a DataFrame from the imputed array using the original column names
# (the index is the default RangeIndex).
multi_imputed_df = pd.DataFrame(multi_imputed_array, columns=test_df.columns)


In [7]:
# Features chosen by feature selection on the training dataset.
# The order of the original list is preserved (presumably the column order the
# model was trained with — confirm before reordering).
feature_selection_list = [
    'original_shape_Maximum2DDiameterColumn',
    'original_firstorder_90Percentile',
    'original_glcm_JointEntropy',
    'original_glcm_Imc1',
    'original_gldm_SmallDependenceLowGrayLevelEmphasis',
    'original_firstorder_Minimum',
    'original_glrlm_RunPercentage',
    'original_firstorder_Variance',
    'ChemoGrade',
    'original_shape_LeastAxisLength',
    'original_shape_Maximum2DDiameterSlice',
    'TumourStage',
    'original_shape_Sphericity',
    'original_glszm_SizeZoneNonUniformity',
    'original_firstorder_Range',
    'original_glcm_SumEntropy',
    'original_firstorder_RootMeanSquared',
    'original_shape_Maximum2DDiameterRow',
    'original_glcm_JointEnergy',
    'Gene',
    'original_gldm_DependenceNonUniformityNormalized',
    'original_glszm_SmallAreaHighGrayLevelEmphasis',
    'original_shape_Maximum3DDiameter',
    'original_firstorder_MeanAbsoluteDeviation',
    'original_shape_MinorAxisLength',
    'original_glszm_ZoneEntropy',
    'original_glcm_MaximumProbability',
    'original_firstorder_10Percentile',
    'original_gldm_LargeDependenceHighGrayLevelEmphasis',
    'original_firstorder_Maximum',
    'original_glszm_SizeZoneNonUniformityNormalized',
    'ER',
    'original_firstorder_Kurtosis',
    'HER2',
    'original_firstorder_RobustMeanAbsoluteDeviation',
    'original_shape_MajorAxisLength',
    'original_shape_Elongation',
    'original_glszm_LowGrayLevelZoneEmphasis',
    'Age',
    'original_glcm_SumSquares',
    'original_firstorder_Skewness',
    'original_glrlm_ShortRunHighGrayLevelEmphasis',
    'original_gldm_SmallDependenceHighGrayLevelEmphasis',
    'original_firstorder_InterquartileRange',
]

# Restrict the imputed data to the selected feature columns, in list order.
feature_selected = multi_imputed_df.loc[:, feature_selection_list]


In [8]:
# Standardise the selected features before they are passed to the model.
# NOTE(review): the scaler is fitted on this test data rather than reusing the
# statistics from training, and the result is named Xs_train although it holds
# test features — confirm this matches how the model was trained.
scaler = StandardScaler().fit(feature_selected)
Xs_train = scaler.transform(feature_selected)

In [9]:
# Run the loaded model on the standardised test features; the result is written
# out later as the 'RelapseFreeSurvival (outcome)' column.
predictions = SVR.predict(Xs_train)


In [12]:
# Pair each retained ID with its prediction and save the table as CSV
# (without the DataFrame index).
target_df = pd.DataFrame(
    data={
        'ID': ID_data,
        'RelapseFreeSurvival (outcome)': predictions,
    }
)
target_df.to_csv(path_or_buf='RFSPrediction.csv', index=False)